Preprocessing

Data is extracted and cleaned using Python. Some transformations, such as converting datetimes to days since baseline, are also done in Python, and the results are saved into a config spreadsheet.

R reads the cleaned data from the spreadsheet and uses it to (1) create a graph structure and (2) convert the data into a JAGS-readable format.

# Read every sheet of the config workbook into a same-named data frame in
# the global environment (e.g. a sheet called "well_fp_map" becomes the
# variable `well_fp_map`).
configsheets = excel_sheets(configpath)
for (sheet_name in configsheets) {
  sheet_df = read_excel(configpath, sheet_name)
  assign(sheet_name, sheet_df)
}
# Regression data; date_numeric is days since baseline (computed in Python).
# Pre-baseline dates are invalid, so blank them out.
regression_df = read_excel(regdatapath) %>%
  mutate(date_numeric=ifelse(date_numeric>0, date_numeric, NA))  # remove dates before baseline
## Warning: package 'bindrcpp' was built under R version 3.4.4
# Days since baseline for "today". units="days" is fixed explicitly:
# `Sys.time() - base_datetime` lets difftime pick its unit automatically
# (secs/mins/hours/days), which would silently change the scale if the
# baseline were recent.
today_numeric = difftime(Sys.time(), base_datetime, units="days") %>% as.numeric()

# assign unique facility IDs
# Build one global name -> integer-ID lookup covering every node in the
# flow network. 'DUMMY' deliberately comes first so it gets ID 1, which
# the JAGS model hard-codes as the zero-flow dummy node.
well_names = regression_df$well %>% unique()
fp_names = c(well_fp_map$fp, fp_gen_map$fp, fp_constants$fp) %>% unique()
fluid_types = c('ip', 'lp', 'w')
gen_names = gen_constants$gen %>% unique() %>% sort()
# one per-fluid "dummy generator" node per real generator, e.g. "G1_ip"
ip_gen_names = paste(gen_names, 'ip', sep='_')
lp_gen_names = paste(gen_names, 'lp', sep='_')
w_gen_names = paste(gen_names, 'w', sep='_')
dummy_gen_names = c(ip_gen_names, lp_gen_names, w_gen_names) %>% sort()
all_names = c('DUMMY', well_names, fp_names, dummy_gen_names, gen_names)
ids = seq_along(all_names)  # seq_along instead of 1:length() (safe even for length 0)
names(ids) = all_names

# replace names in data with IDs
# Map facility names to integer IDs via the global named vector `ids`;
# inside mutate(), `ids[well]` looks each row's name up in that vector,
# so every table ends up keyed by the same ID space.
regression_df = regression_df %>% mutate(well_id=ids[well]) %>% select(-well)
# NOTE(review): unlike the other tables, the `well` column is kept here;
# it is only dropped later in the facilities join — confirm intentional.
operating_conditions = operating_conditions %>% mutate(well_id=ids[well]) %>% rename(whp_pred=whp)
fp_constants = fp_constants %>% mutate(fp_id=ids[fp]) %>% select(-fp)
gen_constants = gen_constants %>% mutate(gen_id=ids[gen]) %>% select(-gen)
well_fp_map = well_fp_map %>% mutate(well_id=ids[well], fp_id=ids[fp]) %>% select(-c(well, fp))
# fp_gen_map links each FP to its ip/lp/w generator targets
fp_gen_map = fp_gen_map %>% mutate(fp_id=ids[fp], gen_ip_id=ids[gen_ip], gen_lp_id=ids[gen_lp], gen_w_id=ids[gen_w]) %>% select(-c(fp, gen_ip, gen_lp, gen_w))

Graph

# create connectivity matrix. i flows to j
# wells to FPs
v = matrix(0, nrow=length(ids), ncol=length(ids))
# The dummy node (ID 1) nominally flows into every node so no node's
# inflow set is empty (JAGS cannot index an empty subset); the dummy
# carries zero mass, so downstream sums are unaffected.
v[1,] = 1
for (i in seq_len(nrow(well_fp_map))) {  # seq_len: safe if the map has 0 rows
  # assumes columns 1 and 2 are well_id and fp_id — holds if the sheet
  # originally contained only well/fp columns; TODO confirm
  id_i = well_fp_map[[i, 1]]
  id_j = well_fp_map[[i, 2]]
  v[id_i, id_j] = 1
}
# send ip/lp/w flows to dummy gens
# For each FP row, columns 2..ncol hold the target generator IDs per
# fluid, in the same order as fluid_types ('ip', 'lp', 'w').
for (i in seq_len(nrow(fp_gen_map))) {  # seq_len: safe if the map has 0 rows
  id_i = fp_gen_map[[i, 1]]  # fp_id — assumes column 1; TODO confirm
  for (j in 2:ncol(fp_gen_map)) {
    facility_j = names(ids)[fp_gen_map[[i, j]]]
    # e.g. generator "G1" receiving ip flow -> dummy node "G1_ip"
    facility_dummy_j = paste(facility_j, fluid_types[j-1], sep='_')
    id_j = ids[facility_dummy_j]
    if (!is.na(id_j)) {  # skip fluids with no matching dummy-gen node
      v[id_i, id_j] = 1
    }
  }
}
# dummy gens to gens
# Each real generator receives the flows collected by its three
# per-fluid dummy nodes ("<gen>_ip", "<gen>_lp", "<gen>_w").
for (i in seq_len(nrow(gen_constants))) {  # seq_len: safe for 0 rows
  id_j = gen_constants$gen_id[i]
  facility_j = names(ids)[id_j]
  for (fluid in fluid_types) {
    facility_dummy_i = paste(facility_j, fluid, sep='_')
    id_i = ids[facility_dummy_i]
    # guard against a missing dummy node, mirroring the FP loop above
    if (!is.na(id_i)) {
      v[id_i, id_j] = 1
    }
  }
}

# convert form: row j of m lists the source-node IDs flowing into node j,
# left-packed in increasing ID order and zero-padded to the maximum
# in-degree. JAGS indexes inflows as m[j, 1:n_inflows[j]].
m = matrix(0, nrow=nrow(v), ncol=max(colSums(v)))
for (j in seq_len(ncol(v))) {
  inflows = which(v[, j] == 1)  # replaces the O(n^2) scalar double loop
  if (length(inflows) > 0) {
    m[j, seq_along(inflows)] = inflows
  }
}

# generate coordinates
# Hand-laid plot layout: dummy just left of the wells (x=-0.1), wells at
# x=0, FPs at x=1, per-fluid dummy gens at x=2, real gens at x=2.5.
dummy_locs = data.frame(name='DUMMY', x=-0.1, y=0)
# wells start at y=1/(n-1) so the dummy's y=0 slot stays clear
well_locs = data.frame(name=well_names, x=0, y=seq(1/(length(well_names)-1), 1, length.out=length(well_names)))
fp_locs = data.frame(name=fp_names, x=1, y=seq(0, 1, length.out=length(fp_names)))
gen_dummy_locs = data.frame(name=dummy_gen_names, x=2, y=seq(0, 1, length.out=length(dummy_gen_names)))
gen_locs = data.frame(name=gen_names, x=2.5, y=seq(1/11, 10/11, length.out=length(gen_names)))
locs = rbind(dummy_locs, well_locs, fp_locs, gen_dummy_locs, gen_locs)
locs$id = ids[locs$name]
# order rows by vertex ID so locs lines up with the graph's vertex order
locs = locs %>% arrange(id)

# Build the flow graph from the adjacency matrix and attach plotting
# attributes vertex-by-vertex. `locs` is sorted by vertex ID, so its
# columns line up with the vertex order of the graph.
vertex_attrs = list(
  label = as.vector(locs$name),
  x = as.vector(locs$x),
  y = as.vector(locs$y),
  label.degree = pi,
  size = 8
)
g = graph_from_adjacency_matrix(v)
for (attr_name in names(vertex_attrs)) {
  g = set_vertex_attr(g, attr_name, value=vertex_attrs[[attr_name]])
}
g = as.undirected(g)

par(mar=c(0,0,0,0))  # zero margins: let the fixed layout fill the device
plot.igraph(g, vertex.label.dist=3)

The dummy node is necessary because when indexing a subset of flows that go into a node, this subset cannot be empty. The dummy node has zero mass flowing out of it.

Data

# Flatten the observed regression data to a plain list for JAGS
# (date and h are not used by the regression part of the model).
regression_list = regression_df %>% select(-c(date, h)) %>% as.list()
# NOTE(review): operating_conditions_list, fp_constants_list and
# gen_constants_list are never added to `data` below (facilities_list
# supplies the same columns aligned by ID) — they look unused; confirm
# before removing.
operating_conditions_list = operating_conditions %>% arrange(well_id) %>% select(whp_pred) %>% as.list()
fp_constants_list = as.list(fp_constants)
gen_constants_list = as.list(gen_constants %>% select(gen_id, factor))
# One row per facility ID so JAGS vectors can be indexed by ID directly;
# full joins leave NA for attributes a facility type does not have.
facilities = data.frame(id=seq_len(max(ids))) %>%   # seq_len instead of 1:max()
  full_join(operating_conditions %>% rename(id=well_id) %>% select(-well), by='id') %>%
  full_join(gen_constants %>% select(factor, id=gen_id), by='id') %>%
  full_join(fp_constants %>% rename(id=fp_id), by='id') %>%
  mutate(mf_pred=NA) %>%          # placeholder: predicted inside the model
  mutate(n_inflows=colSums(v))    # in-degree of each node in the flow graph
## Warning: Column `id` has different attributes on LHS and RHS of join

## Warning: Column `id` has different attributes on LHS and RHS of join

## Warning: Column `id` has different attributes on LHS and RHS of join
facilities_list = facilities %>% select(-id) %>% as.list()

# Named per-category ID vectors, used as index sets by the JAGS for-loops.
well_ids = ids[well_names]
fp_ids = ids[fp_names]
ip_gen_ids = ids[ip_gen_names]
lp_gen_ids = ids[lp_gen_names]
w_gen_ids = ids[w_gen_names]
gen_ids = ids[gen_names]

# insert production curve predictions
# Synthetic WHP sweep (10 points, 7..12) used to trace one well's
# production curve for verification.
# NOTE(review): well ID 9 is hard-coded — confirm this is the intended well.
prod = data.frame(whp_prod=seq(7, 12, length.out=10),
                  well_id_prod=9)
prod_list = prod %>% as.list()

# Final JAGS data list: observed regressions, per-facility constants,
# the production sweep, the index sets, the inflow matrix m, and
# dummy=1 (the index of the zero-flow dummy node).
data = c(regression_list, facilities_list, prod_list,
         list(well_ids=well_ids, fp_ids=fp_ids, gen_ids=gen_ids,
              ip_gen_ids=ip_gen_ids, lp_gen_ids=lp_gen_ids, w_gen_ids=w_gen_ids,
              today_numeric=today_numeric, m=m, dummy=1))

Model

# JAGS model. Structure:
#   1. hierarchical per-well linear regressions: mf ~ whp + days
#   2. production-curve predictions for one well (verification)
#   3. gamma models filling in missing enthalpy constants
#   4. deterministic mass/steam-flow propagation through the network
# Fix: tau_date_numeric is a precision, so it needs a positive-support
# prior; its prior was dnorm (can go negative), inconsistent with every
# other tau in the model — changed to dgamma.
code = "
model {
  #######################################
  # fit individual regressions to wells #
  #######################################
  for (i in 1:length(whp)) {
    # elliptic
    # mu2[i] <- beta_whp[well_id[i]] * whp[i]^2 + beta_date[well_id[i]] * date_numeric[i]^2
    # mu[i] <- sqrt(max(mu2[i], 0)) + Intercept[well_id[i]]
    # linear
    mu[i] <- Intercept[well_id[i]] + beta_whp[well_id[i]] * whp[i] + beta_date[well_id[i]] * date_numeric[i]

    mf[i] ~ dnorm(mu[i], tau[well_id[i]])
  }

  # HIERARCHICAL
  # fills in for any missing wells
  for (j in well_ids) {
    Intercept[j] ~ dnorm(mu_Intercept, tau_Intercept)
    beta_whp[j] ~ dnorm(mu_beta_whp, tau_beta_whp)
    beta_date[j] ~ dnorm(mu_date, tau_date)
    tau[j] ~ dgamma(1e-8, 1e-8)
  }
  # fill in any missing dates
  for (i in 1:length(whp)) {
    date_numeric[i] ~ dnorm(mu_date_numeric, tau_date_numeric)
  }
  mu_date_numeric ~ dnorm(0, 1e-8)
  tau_date_numeric ~ dgamma(1e-8, 1e-8)  # precision must be positive (was dnorm)

  # set hyperparameters
  mu_Intercept ~ dnorm(0, 1e-8)
  mu_beta_whp ~ dnorm(0, 1e-12)
  mu_date ~ dnorm(0, 1e-8)
  tau_Intercept ~ dgamma(1e-8, 1e-8)
  tau_beta_whp ~ dgamma(1e-8, 1e-8)
  tau_date ~ dgamma(1e-8, 1e-8)

  #####################################
  # production curve for verification #
  #####################################
  for (i in 1:length(whp_prod)) {
    # elliptic
    # mf_prod2[i] <- beta_whp[well_id_prod[i]] * whp_prod[i]^2 + beta_date[well_id_prod[i]] * today_numeric^2
    # mf_prod[i] <- sqrt(max(mf_prod2[i], 0)) + Intercept[well_id_prod[i]]
    # linear
    mf_prod[i] <- Intercept[well_id_prod[i]] + beta_whp[well_id_prod[i]] * whp_prod[i] + beta_date[well_id_prod[i]] * today_numeric
  }

  ######################################################
  # simple model to fill in missing enthalpy constants #
  ######################################################
  for (i in fp_ids) {
    # missing fp constants
    hf_ip[i] ~ dgamma(param[1], param[7])
    hg_ip[i] ~ dgamma(param[2], param[8])
    hfg_ip[i] ~ dgamma(param[3], param[9])
    hf_lp[i] ~ dgamma(param[4], param[10])
    hg_lp[i] ~ dgamma(param[5], param[11])
    hfg_lp[i] ~ dgamma(param[6], param[12])
  }
  for (i in c(1, well_ids)) { h[i] ~ dgamma(param[13], param[14]) } # missing well constants
  for (i in 1:14) { param[i] ~ dgamma(1e-8, 1e-8) }                 # uniform priors

  ########################################
  # make predictions (the stuff we want) #
  ########################################
  mf_pred[dummy] <- 0  # dummy well
  ip_sf[dummy] <- 0
  lp_sf[dummy] <- 0
  wf[dummy] <- 0

  for (i in well_ids) {
    # elliptic
    # mf_pred2[i] <- beta_whp[i] * whp_pred[i]^2 + beta_date[i] * today_numeric^2
    # mf_pred[i] <- sqrt(max(mf_pred2[i], 0)) + Intercept[i]
    # linear
    mf_pred[i] <- Intercept[i] + beta_whp[i] * whp_pred[i] + beta_date[i] * today_numeric
  }
  for (i in fp_ids) {
    mf_pred[i] <- sum(mf_pred[m[i,1:n_inflows[i]]])
    h[i] <- sum(mf_pred[m[i, 1:n_inflows[i]]] * h[m[i, 1:n_inflows[i]]]) / ifelse(mf_pred[i]!=0, mf_pred[i], 1)
    ip_sf[i] <- (h[i] - hf_ip[i]) / hfg_ip[i] * mf_pred[i]
    lp_sf[i] <- (hf_ip[i] - hf_lp[i]) / hfg_lp[i] * (mf_pred[i] - ip_sf[i])
    total_sf[i] <- ip_sf[i] + lp_sf[i]
    wf[i] <- total_sf[i]
  }
  # dummy gens and actual gens
  for (i in ip_gen_ids) { mf_pred[i] <- sum(ip_sf[m[i, 1:n_inflows[i]]]) }
  for (i in lp_gen_ids) { mf_pred[i] <- sum(lp_sf[m[i, 1:n_inflows[i]]]) }
  for (i in w_gen_ids) { mf_pred[i] <- sum(wf[m[i, 1:n_inflows[i]]]) }
  for (i in gen_ids) {
    mf_pred[i] <- sum(mf_pred[m[i,1:n_inflows[i]]])
    power[i] <- mf_pred[i] / mu_factor[i]
    mu_factor[i] ~ dunif(0.95*factor[i], 1.05*factor[i])  # uncertainty from email
  }
  total_power <- sum(power[gen_ids])
}
"

# Names of the nodes to monitor. mf_pred[1] is the constant dummy (0),
# so monitoring starts at index 2.
vars = c(paste0('mf_pred[', 2:length(data$mf_pred), ']'),
         paste0('power[', gen_ids, ']'),
         paste0('h[', fp_ids, ']'),
         paste0('mf_prod[', seq_along(data$whp_prod), ']'))  # seq_along instead of 1:length()
n_chains = 4
burn_in = 200
n_steps = 1000

# Compile the JAGS model; `data` supplies both the observed data and the
# index vectors the model's for-loops iterate over.
model = jags.model(textConnection(code), data, n.chains=n_chains)
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 1565
##    Unobserved stochastic nodes: 146
##    Total graph size: 6026
## 
## Initializing model
update(model, burn_in)  # discard burn-in iterations
out = coda.samples(model, n.iter=n_steps, variable.names=vars)
outmatrix = as.matrix(out)
# Long format: one row per (monitored node, draw). parse_number() pulls
# the bracketed index out of names like "mf_pred[12]".
outframe = as.data.frame(outmatrix) %>%
  gather(key=facility, value=value) %>%
  mutate(variable=gsub("\\[.*$", "", facility), facility=parse_number(facility))
# translate numeric IDs back to facility names
# NOTE(review): mf_prod indices are sweep positions, not facility IDs,
# so their facility labels here are spurious; prod is re-extracted below.
outframe$facility = names(ids)[outframe$facility]

Posteriors

# Posterior density plot for one monitored variable across a set of
# facilities; the value>0 filter drops non-positive draws (e.g. from
# dummy-fed nodes) before plotting.
plot_posterior_density = function(facility_names, var_name, x_label) {
  ggplot(outframe %>% filter(facility %in% facility_names, variable==var_name, value>0),
         aes(x=value, fill=facility)) +
    geom_density(alpha=0.5, color=NA) +
    xlim(0, NA) +
    labs(x=x_label)
}

g1 = plot_posterior_density(well_names, "mf_pred", "Mass flow")
g2 = plot_posterior_density(fp_names, "mf_pred", "Mass flow")
g3 = plot_posterior_density(gen_names, "mf_pred", "Mass flow")
g4 = plot_posterior_density(gen_names, "power", "Power")

ggplotly(g1, tooltip=c('facility', 'value'))
ggplotly(g2, tooltip=c('facility', 'value'))
p3 = ggplotly(g3, tooltip=c('facility', 'value'))
p4 = ggplotly(g4, tooltip=c('facility', 'value'))
subplot(p3, p4)
# Extract the production-curve draws; `which` is the sweep position, so
# map it back to the WHP value that produced it.
# NOTE(review): this reassigns `prod` (the sweep data frame above) and
# shadows base::prod — consider a distinct name.
prod = as.data.frame(outmatrix) %>%
  select(contains('prod')) %>%
  gather(key=facility, value=value) %>%
  mutate(which=parse_number(facility)) %>%
  mutate(whp=data$whp_prod[which]) %>%
  rename(mf=value)

# Observed data for well 9, keeping only dates with more than two points
# so each per-date fit below has enough data.
plotdata = regression_df %>%
  filter(well_id==9) %>%
  mutate(datetime = factor(as.Date(date))) %>%
  group_by(datetime) %>%
  filter(n()>2) %>%
  mutate(mf2 = mf^2)  # squared mass flow for the elliptic fit

# Per-date elliptic fits: mf^2 modeled as linear in whp^2 with a
# date-specific slope and intercept (+0 suppresses a global intercept).
mylm = lm(mf2 ~ datetime * I(whp^2) + datetime +0, data=plotdata)
datetime = plotdata$datetime %>% unique()
whp = seq(0, 16, 0.01)
# prediction grid: every observed date crossed with the WHP range
reglines = expand.grid(datetime=datetime, whp=whp)
reglines$mf2 <- unname(predict(mylm, reglines))
# negative fitted mf2 gives NaN here; ggplot drops those points later
reglines$mf <- sqrt(reglines$mf2)
## Warning in sqrt(reglines$mf2): NaNs produced
reglines$date = as.POSIXct(reglines$datetime)

# Overlay: posterior production-curve draws (jittered points + quantile
# lines) against the observed data and the per-date lm fit lines.
ggplot(prod, aes(x=whp, y=mf)) +
  geom_jitter(width=0, alpha=0.01) +  # vertical-only jitter; low alpha shows density
  geom_quantile() +
  geom_line(data=reglines, aes(linetype=datetime, color=date)) +
  geom_point(data=plotdata, aes(color=date)) +
  labs(color="Date", linetype="Fitted curve") +
  xlim(0, NA) + ylim(0, NA)
## Smoothing formula not specified. Using: y ~ x
## Warning: Removed 1599 rows containing missing values (geom_path).